# NOTE: the workspace is deliberately not cleared here. Run this script in a
# fresh R session (e.g. via `Rscript`); wiping the global environment would
# delete the caller's objects whenever the script is source()d, and it does
# not reset loaded packages or options anyway.

# Purpose: Calculate semantic coherence

# Reference: Mimno D, Wallach H, Talley E, Leenders M, McCallum A (2011). “Optimizing semantic
# coherence in topic models.” In Proceedings of the Conference on Empirical Methods in
# Natural Language Processing, pp. 262–272. Association for Computational Linguistics.

# Formula: Let D(v) be the number of documents that contain word v, and D(v, v') the number of documents that contain
# both v and v'. Then, for the list of the M most probable words in topic k, the semantic coherence of topic k is

#    C_k = \sum_{i=2}^{M} \sum_{j=1}^{i-1} log ( \frac{ D(v_i, v_j)+1 }{ D(v_j) } )


### PARAMETERS ########################
# Usage: Rscript <this script> <no.words> <topic.model>
#   no.words    - number of top words per topic used for the coherence score
#                 (max is 50)
#   topic.model - number of topics of the estimated model to evaluate
#
# The arguments have to be read from the command line explicitly: without this
# assignment the name `args` resolves to the base function args(), which
# cannot be subset.
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 2) {
    stop("Usage: Rscript <script> <no.words> <topic.model>", call. = FALSE)
}

# Non-numeric input becomes NA here and is rejected by the checks below
no.words <- suppressWarnings(as.numeric(args[1])) # max is 50
topic.model <- suppressWarnings(as.numeric(args[2]))

# Coherence is a sum over word pairs, so at least two words are required
if (is.na(no.words) || no.words < 2 || no.words != round(no.words)) {
    stop("no.words must be a whole number >= 2, got: ", args[1], call. = FALSE)
}
if (is.na(topic.model) || topic.model < 1 || topic.model != round(topic.model)) {
    stop("topic.model must be a whole number >= 1, got: ", args[2], call. = FALSE)
}
#######################################

cat(paste("Calculating coherence for model",topic.model,"using top",no.words,"words\n"))

### PATHS ##############################
# Every input and output lives in the local data directory; the output file
# name encodes the model size and the number of top words used.
estimated.topics.path <- file.path(".", "data")
word.id.file <- file.path(".", "data", "word_ids.dat")
dtm.input.file <- file.path(".", "data", "data-mult.dat")
output.file <- file.path(
    ".",
    "data",
    paste0("semantic_coherence_model_", topic.model, "_top_", no.words, "_words.csv")
)
#######################################


# Load the corpus
# ---------------
# One document per line:
# - the first integer is the document ID
# - the remaining space-separated entries are word counts written as
#   'word ID:count'
# Splitting on newlines gives one character string per non-blank line.
docs <- scan(file = dtm.input.file, what = character(), sep = "\n")

# Load the vocabulary table
# -------------------------
# The first column (the word label) becomes the row names so that a word's ID
# can be looked up by the word itself; the remaining columns are then labelled.
# NOTE(review): read.table() keeps its default quote/comment handling here, so
# tokens containing quotes or '#' may not parse as intended - confirm against
# the actual vocabulary file.
words <- read.table(file = word.id.file, sep = "\t", row.names = 1)
colnames(words) <- c("freq", "id")


# Read topic results and calculate semantic coherence for each topic
# ------------------------------------------------------------------

# Compute the semantic coherence of a single topic.
#
# Arguments:
#   ids  - word IDs of the topic's top words, in the row order of the topics
#          table (the formula treats the first entry as v_1)
#   docs - character vector with one document per element; word counts are
#          written as 'word ID:count' and separated by whitespace
# Returns:
#   The sum over all pairs i > j of log((D(v_i, v_j) + 1) / D(v_j)), where
#   D(.) counts the documents containing the word(s). NA if fewer than two
#   IDs are supplied (there is no pair to score). The result is Inf when some
#   D(v_j) is zero; a warning is raised in that case.
topic.coherence <- function(ids, docs) {
    n.ids <- length(ids)
    if (n.ids < 2) {
        return(NA_real_)
    }

    # Document-by-word presence matrix (1 = document contains the word).
    # Each word is searched for once instead of once per word pair.
    present <- matrix(0, nrow = length(docs), ncol = n.ids)
    for (w in seq_len(n.ids)) {
        # The ID must start an entry, i.e. follow whitespace (or the line
        # start); an unanchored "5:" would also match "15:" and "25:".
        # format() prevents large IDs from being pasted as e.g. "1e+05".
        id.label <- format(ids[w], scientific = FALSE, trim = TRUE)
        present[, w] <- grepl(paste0("(^|[[:space:]])", id.label, ":"), docs)
    }

    # co.freq[i, j] = number of documents containing both word i and word j;
    # the diagonal therefore holds each word's document frequency.
    co.freq <- crossprod(present)
    doc.freq <- diag(co.freq)
    if (any(doc.freq == 0)) {
        warning("Some top words occur in no document; coherence will be infinite",
                call. = FALSE)
    }

    # ratio[i, j] = (D(v_i, v_j) + 1) / D(v_j); the strict lower triangle
    # holds exactly the pairs with i > j required by the formula.
    ratio <- sweep(co.freq + 1, 2, doc.freq, "/")
    sum(log(ratio[lower.tri(ratio)]))
}

# Print output
cat(paste("\nCalculating coherence for model:",topic.model,"\n"))

# read data
fname <- file.path(estimated.topics.path, paste0(topic.model, "-topics_100_iterations_dtm_results/estimates"), paste0(topic.model, "_topics_avg_word_probs.csv"))
topics <- read.csv(fname, stringsAsFactors=FALSE)

if (no.words > nrow(topics)) {
    stop("Requested ", no.words, " words but the topics file only has ",
         nrow(topics), " rows", call. = FALSE)
}
if (topic.model > ncol(topics)) {
    stop("Expected ", topic.model, " topic columns but the topics file only has ",
         ncol(topics), call. = FALSE)
}

# reduce to selected vocab size
# (drop = FALSE keeps a data frame even when there is a single topic column)
topics <- topics[seq_len(no.words), , drop = FALSE]

# data frame to hold coherence value for each topic
out <- data.frame(topic = seq_len(topic.model), coherence = NA_real_)

# loop through topics and calculate semantic coherence for each topic
for (topic.number in seq_len(topic.model)) {

    # Print current status
    cat(paste("...topic",topic.number))

    # lookup word ID numbers; match() is exact, whereas indexing a data frame
    # by row name partially matches and could silently pick up another word
    top.words <- as.character(topics[[topic.number]])
    id <- words[["id"]][match(top.words, rownames(words))]
    if (anyNA(id)) {
        stop("Topic ", topic.number, " contains words missing from the word ID table: ",
             paste(top.words[is.na(id)], collapse = ", "), call. = FALSE)
    }

    out$coherence[topic.number] <- topic.coherence(id, docs)

    # save results (do after each topic loop to preserve output if script stops)
    write.csv(out, file=output.file, row.names=FALSE)
}


